First we need to import the necessary modules.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
from beeswarm import beeswarm
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind
from matplotlib.patches import Ellipse,Patch,Arrow
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('png')
from IPython.display import HTML, display
import tabulate
from decimal import Decimal
import warnings
warnings.filterwarnings('ignore') #in order to suppress warnings output of t-test if one sample is 0
pd.set_option('display.max_rows', 1000)
The we read sample files and extract separate data containers for the 4 tissues in question. The data have been normalized.
sampleinfo=pd.read_csv('sample_info_consensus.csv',sep=',')
sampleinfo=sampleinfo.query('paper == "fromm" or paper == "schee" or paper == "selitsky" or paper == "neerincx"')
countmat=pd.read_csv('consensus.rpm_uniq_seqs_correct_names.csv',sep=',')
countmat.rename(columns = {'Unnamed: 0':'mirna'}, inplace = True)
liver_normal=countmat[[row[0] for row in sampleinfo.values if row[3]=='liver' and row[2]=='normal']]
liver_meta=countmat[[row[0] for row in sampleinfo.values if row[3]=='liver' and row[2]=='metastasis']]
crc_normal=countmat[[row[0] for row in sampleinfo.values if row[3]=='colorect' and row[2]=='normal' and row[4]!='schee']]
crc_tumor=countmat[[row[0] for row in sampleinfo.values if row[3]=='colorect' and row[2]=='tumor']]
liver_normal.index=liver_meta.index=crc_normal.index=crc_tumor.index=list(countmat['mirna'])
For each miRNA, certain values are calculated such as t-test relevance values between different tissue types, fold changes between average values and count of samples. This is later used to filter the data. Also, miRNAs that are known to be tissue-related are excluded.
mirnanamelist=[]
mirnadatalist=[]
tissuemirna=["Hsa-Mir-122_5p ","Hsa-Mir-126_5p ","Hsa-Mir-126_3p ","Hsa-Mir-144_5p ","Hsa-Mir-144_3p ","Hsa-Mir-486_5p ",
"Hsa-Mir-143_3p ","Hsa-Mir-145_5p ","Hsa-Mir-150_5p ","Hsa-Mir-142-P1_5p ","Hsa-Mir-223_3p "]
for i in range(len(countmat.values)):
s1,p1=ttest_ind(crc_normal.values[i],crc_tumor.values[i])
s2,p2=ttest_ind(crc_tumor.values[i],liver_meta.values[i])
s3,p3=ttest_ind(crc_normal.values[i],liver_normal.values[i])
s4,p4=ttest_ind(liver_meta.values[i],liver_normal.values[i])
fc_cm_cb=np.mean(crc_tumor.values[i])/np.mean(crc_normal.values[i])
if fc_cm_cb<1:fc_cm_cb=-1/(fc_cm_cb+0.00001)
fc_lm_cm=np.mean(liver_meta.values[i])/np.mean(crc_tumor.values[i])
if fc_lm_cm<1:fc_lm_cm=-1/(fc_lm_cm+0.00001)
fc_lb_cb=np.mean(liver_normal.values[i])/np.mean(crc_normal.values[i])
if fc_lb_cb<1:fc_lb_cb=-1/(fc_lb_cb+0.00001)
fc_lb_lm=np.mean(liver_normal.values[i])/np.mean(liver_meta.values[i])
if fc_lb_lm<1:fc_lb_lm=-1/(fc_lb_lm+0.00001)
n_cb=len(crc_normal.values[i])
n_cm=len(crc_tumor.values[i])
n_lb=len(liver_normal.values[i])
n_lm=len(liver_meta.values[i])
totcounts=np.mean(liver_normal.values[i])+np.mean(liver_meta.values[i])+np.mean(crc_normal.values[i])+np.mean(crc_tumor.values[i])
tm=False
if countmat['mirna'][i] in tissuemirna:tm=True
mirnadatalist.append([p1,p2,p3,p4,fc_cm_cb,fc_lm_cm,fc_lb_cb,fc_lb_lm,n_cb,n_cm,n_lb,n_lm,totcounts,tm])
mirnanamelist.append(countmat['mirna'][i])
mirnalist=pd.DataFrame(data=mirnadatalist,index=mirnanamelist,columns=[
'p_cm_cb',
'p_cm_lm',
'p_cb_lb',
'p_lm_lb',
'fc_cm_cb',
'fc_lm_cm',
'fc_lb_cb',
'fc_lb_lm',
'N_cb',
'N_cm',
'N_lb',
'N_lm',
'total_counts',
'tissue_mirna'])
This is what it looks like for one example miRNA.
mirnalist.T['Hsa-Mir-122_5p ']
Next we define the visualization function called "combplot". The individual experimental miRNA counts are drawn as vertical (liver) and horizontal (colon) lines. The area where they overlap is indicated by an ellipse with the means as center and the standard deviation as half axes. This is done both for benign and malign tissues.
The straight green line indicates the equal count where colon and liver counts are the same (FC=1). An arrow indicates the trend from benign to malign. If the ellipse is top left it means the expression for this miRNA is higher in the colon tissue than in the liver tissue while it is lower if the ellipse is in the bottom right.
Additionally, a classical box plot of the four tissues is shown and overlaid with a beeswarm plot that shows individual measurements.
def combplot(mirna):
#prepare figure
fig = plt.figure(figsize=(13,7))
fig.suptitle(mirna,fontsize=20)
#prepare ellipses subplot
ax = fig.add_axes([0.1,0.1,0.44,0.44*13/7])
plt.xlabel('Individual counts (liver)')
plt.ylabel('Individual counts (colon)')
plotlimitmax=max(liver_normal.T[mirna].max(),crc_normal.T[mirna].max(),liver_meta.T[mirna].max(),crc_tumor.T[mirna].max())
plotlimitmin=min(liver_normal.T[mirna].min(),crc_normal.T[mirna].min(),liver_meta.T[mirna].min(),crc_tumor.T[mirna].min())
ax.set_xlim(plotlimitmin, plotlimitmax)
ax.set_ylim(plotlimitmin, plotlimitmax)
#ax.set_aspect('equal')
#prepare ellipses
xn=liver_normal.T[mirna].mean()
yn=crc_normal.T[mirna].mean()
xerrn=2*liver_normal.T[mirna].std()
yerrn=2*crc_normal.T[mirna].std()
xc=liver_meta.T[mirna].mean()
yc=crc_tumor.T[mirna].mean()
xerrc=2*liver_meta.T[mirna].std()
yerrc=2*crc_tumor.T[mirna].std()
ne=Ellipse(xy=(xn,yn), width=xerrn, height=yerrn,color='blue',lw=2,fill=False,alpha=1,label='Benign (Median+St.d.)')
ce=Ellipse(xy=(xc,yc), width=xerrc, height=yerrc,color='red',lw=2,fill=False,alpha=1,label='Malign (Media+St.d.)')
ax.add_artist(ne)
ax.add_artist(ce)
ne.set_zorder(10000)
ce.set_zorder(10000)
#draw arrow between ellipses centers
ar=Arrow(xn,yn,xc-xn,yc-yn,width=plotlimitmax/40,zorder=10001,color='purple')
ax.add_artist(ar)
#draw green line of constant fold change
gfc1,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,1*plotlimitmax],'g',label="FC=1")
gfc2,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,2*plotlimitmax],'g--',label="FC=2")
gfc05,=ax.plot([plotlimitmin,plotlimitmax],[plotlimitmin,0.5*plotlimitmax],'g-.',label="FC=-2")
#draw vertical and horizontal lines for each individual measurement
lw,al=10,0.07
nleg=Patch(color='black',alpha=al*5,label='Benign (ind. meas.)')
cleg=Patch(color='yellow',alpha=al*5,label='Malign (ind. meas.)')
ax.legend(handles=[ne,ce,nleg,cleg,gfc1,gfc2,gfc05],fontsize='small')
for xln in liver_normal.T[mirna].values:
ax.plot([xln,xln],[min(crc_normal.T[mirna].values),max(crc_normal.T[mirna].values)],'k-',linewidth=lw,alpha=al)
for ycn in crc_normal.T[mirna].values:
ax.plot([min(liver_normal.T[mirna].values),max(liver_normal.T[mirna].values)],[ycn,ycn],'k-',linewidth=lw,alpha=al)
for xlc in liver_meta.T[mirna].values:
ax.plot([xlc,xlc],[min(crc_tumor.T[mirna].values),max(crc_tumor.T[mirna].values)],'y-',linewidth=lw,alpha=al)
for ycc in crc_tumor.T[mirna].values:
ax.plot([min(liver_meta.T[mirna].values),max(liver_meta.T[mirna].values)],[ycc,ycc],'y-',linewidth=lw,alpha=al)
#prepare subplot for boxplot and beeswarm plot
ax2=fig.add_axes([0.61, 0.3, 0.35, 0.615])
ax2.boxplot([crc_normal.T[mirna].values,
crc_tumor.T[mirna].values,
liver_meta.T[mirna].values,
liver_normal.T[mirna].values],showmeans=True,meanline=True,
labels=['Colon benign\n N='+str(int(mirnalist.T[mirna][8])),
'Colon malign\n N='+str(int(mirnalist.T[mirna][9])),
'Liver malign\n N='+str(int(mirnalist.T[mirna][11])),
'Liver benign\n N='+str(int(mirnalist.T[mirna][10]))])
def GetColor(x):
colors = []
for item in x.index:
coh=sampleinfo['paper'][sampleinfo['sample']==item].values[0]
if coh == 'fromm': colors.append('red')
if coh == 'schee':colors.append('blue')
if coh == 'neerincx':colors.append('green')
if coh == 'selitsky':colors.append('yellow')
colors=np.array(colors)
dat=x.values
inds = dat.argsort()
sorteddatcol = colors[inds]
colors=sorteddatcol.tolist()
return colors
colors = (GetColor(crc_normal.T[mirna]) + GetColor(crc_tumor.T[mirna]) + GetColor(liver_meta.T[mirna]) + GetColor(liver_normal.T[mirna]))
beeswarm([np.copy(crc_normal.T[mirna].values),
np.copy(crc_tumor.T[mirna].values),
np.copy(liver_meta.T[mirna].values),
np.copy(liver_normal.T[mirna].values)],
ax=ax2,s=10,alpha=0.4,col=colors,positions=[1,2,3,4])
red_patch = Patch(color='red', label='Fromm')
blue_patch =Patch(color='blue', label='Schee')
green_patch = Patch(color='green', label='Neerincx')
yellow_patch = Patch(color='yellow', label='Selitsky')
ax2.legend(handles=[red_patch,blue_patch,green_patch,yellow_patch],fontsize='small')
ax3=fig.add_axes([0.61, 0.1, 0.35, 0.14])
ax3.plot([1,1,2,2],[4.5,4,4,4.5],'k')
ax3.text(1.5,3,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][4]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][0])),ha='center')
ax3.plot([2,2,3,3],[2.5,2,2,2.5],'k')
ax3.text(2.5,1,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][5]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][1])),ha='center')
ax3.plot([1,1,4,4],[0.5,0,0,0.5],'k')
ax3.text(2.5,-1,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][6]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][2])),ha='center')
ax3.plot([3,3,4,4],[-1.5,-2,-2,-1.5],'k')
ax3.text(3.5,-3,'FC = %.2f' % Decimal(str(mirnalist.T[mirna][7]))+', p = %.2e' % Decimal(str(mirnalist.T[mirna][3])),ha='center')
ax3.set_xlim(0.6, 4.4)
ax3.axis('off')
plt.show()
We are interested in the miRNA where the liver malign-colon malign ratio and the liver benign-colon malign ratio are both larger than 1 or both smaller than -1. In order to exclude cases that center around FC=1, higher cutoffs of 1.5 and -1.5, respectively, have been chosen.
As an additional filter, we only consider the ones where the change of colon malign to liver malign follow the same trend as liver benign to liver malign.
fco=1.5 #cutoff
query1='total_counts > 400 and tissue_mirna==False and ((fc_lm_cm < %f and fc_lb_cb > 1) or (fc_lm_cm > %f and fc_lb_cb < -1) or (fc_lm_cm < -1 and fc_lb_cb > %f) or (fc_lm_cm > 1 and fc_lb_cb < %f)) and fc_lm_cm*fc_lb_lm>0' % (-fco,fco,fco,-fco)
mirnalist.query(query1)
Now all these are plotted:
for item in list(mirnalist.query(query1).index):
combplot(item)
Now we consider the miRNA that are expressed stronger in one specific tissue, regardless of being malign or benign. Additionally, the relative change of liver expression should go in the opposite direction such that liver remnants alone cannot be the reason for the observed changes.
fco=1.5 #cutoff value
query2='total_counts > 400 and tissue_mirna==0 and ((fc_lm_cm > %f and fc_lb_cb > 1 and fc_lb_lm > 1) or (fc_lm_cm < %f and fc_lb_cb < -1 and fc_lb_lm < 1) or (fc_lm_cm > 1 and fc_lb_cb > %f and fc_lb_lm > 1) or (fc_lm_cm < -1 and fc_lb_cb < %f and fc_lb_lm < 1))' % (fco,-fco,fco,-fco)
mirnalist.query(query2)
for item in list(mirnalist.query(query2).index):
combplot(item)